"""
@Author: Bright
@File: semantic_split.py
@Time: 2025/9/29
@Desc: Semantic text splitting (semantic chunking demo)
# pip install --quiet langchain_experimental langchain_openai
"""
from langchain_experimental.text_splitter import SemanticChunker

from model.MyOllamaEmbeddings import MyOllamaEmbeddings

# Embedding model used to measure semantic similarity between sentences.
embedd_model = MyOllamaEmbeddings(model="nomic-embed-text")

# Read the source knowledge base as a single string.
file_path = "../resource/knowledge.txt"
with open(file_path, encoding="UTF-8") as f:
    knowledge = f.read()

# SemanticChunker splits text at points where the embedding similarity of
# adjacent sentences drops past a breakpoint. The default strategy is
# "percentile" (95th percentile of distances — see the library source);
# it is passed explicitly here for clarity.
text_splitter = SemanticChunker(embedd_model, breakpoint_threshold_type="percentile")

docs = text_splitter.create_documents([knowledge])
print(len(docs))
# Print up to the first two chunks. The bounded slice avoids the IndexError
# the previous docs[1] access raised when short input produced a single chunk.
for i, doc in enumerate(docs[:2]):
    if i:
        print("--" * 50)
    print(doc.page_content)
